####################################################################
# Text S2: R Script for Bioinformatics Analysis of Avian Influenza Virus Data
####################################################################
# R version: 4.1.0
# base version: 4.4.0
# Biostrings version: 2.72.0
# bios2mds version: 1.2.3
# dplyr version: 1.1.4
# msa version: 1.36.0
# tidyr version: 1.3.1
# vegan version: 2.5-7

# Input Requirements:
# - This script requires the following files in the working directory:
#   1. Input FASTA files (Tawidianetal2024_Text S1.txt) containing sequences to analyze.  
#   Add the novel viruses to this FASTA file through either R or command line. 
#	This FASTA input needs to have sequences trimmed at the open reading frame for each gene segment. 
# 	Subsequently, the analysis can be run with FASTA files having either:
#	(A) whole virus sequences (all gene segments combined/virus)
#	(B) the same gene segment across viruses. 
#   2. Reference table ("Tawidianetal2024_Table S3.xlsx") mapping each virus ID to the GenoFLU genotype.
#   3. The remaining csv files needed will be generated through the script.

library(base)
library(Biostrings)
library(bios2mds)
library(dplyr)
library(tidyr)
library(msa)
library(vegan)

# Working directory: set `working_dir` to the folder that holds the input files.
# Leaving it empty keeps the current working directory; the call is skipped in
# that case because setwd("") stops the script with an error.
working_dir <- ""
if (nzchar(working_dir)) {
  setwd(working_dir)
}

# Input files (relative paths are resolved against the working directory)
fasta_file <- "H5N1_References.fasta"    # sequences to align
genotype_file <- "H5N1_Reference.csv"    # table with a "Genotype" column

################################### Distance matrix and data visualization
##### 1) Multiple sequence alignment (MSA) 

# Read the DNA sequences from the input FASTA file
Input_FASTA <- readDNAStringSet(fasta_file)

# Perform the multiple sequence alignment (method can be "Muscle" or "ClustalW")
Alignment <- msa(Input_FASTA, method = "Muscle")

# Convert the alignment produced above to the bios2mds 'align' format and
# export it as a FASTA file in the working directory
Export_alignment <- msaConvert(Alignment, type = "bios2mds::align")
export.fasta(Export_alignment, outfile = "Alignment.fasta")

# Re-import the exported alignment; from here on `Alignment` holds the
# bios2mds representation used by the downstream steps
Alignment <- bios2mds::import.fasta("Alignment.fasta")

##### 2) Dissimilarity scores and PCA ordination
# Score the alignment against itself to obtain the dissimilarity matrix,
# and keep a copy of that matrix on disk
dissimilarity_distances <- mat.dif(Alignment, Alignment)
write.csv(x = dissimilarity_distances, file = "dissimilarity_distances.csv")

# Run the ordination on the dissimilarity matrix
Ordination <- mmds(dissimilarity_distances)

##### 3) PCA visualization
#### 3.1) Random color assignment to genotypes
# Read the genotype reference table
data <- read.csv(genotype_file)

# The color assignment below groups by the "Genotype" column, so stop here
# with a clear message if the table does not provide it
if (!"Genotype" %in% names(data)) {
  stop("Column 'Genotype' not found in ", genotype_file, call. = FALSE)
}

# Pick `n` random color names, one per genotype group.
#
# n: number of colors to return (a single non-negative whole number).
# Returns a character vector of length `n` taken from R's built-in color
# names. Colors are drawn without replacement from the visually distinct
# names, so no two groups share a color; only when `n` is larger than that
# pool are colors reused (a warning is raised in that case).
generate_color_names <- function(n) {
  # distinct = TRUE drops aliases of the same color (e.g. "gray" / "grey")
  available_colors <- colors(distinct = TRUE)
  reuse_needed <- n > length(available_colors)
  if (reuse_needed) {
    warning("More colors requested than distinct color names; some are reused.",
            call. = FALSE)
  }
  sample(available_colors, n, replace = reuse_needed)
}

# Upper limit on how many unique colors we are willing to use (adjust as needed)
desired_num_colors <- 50

# Count how many different genotypes appear in the reference table
num_ID <- length(unique(data$Genotype))

# Abort when the table holds more genotypes than the limit allows
if (num_ID > desired_num_colors) {
  stop("The number of unique genotypes exceeds the desired number of unique colors.")
}

# Build a lookup table with one row per genotype and a random color for each
ID_color_mapping <- distinct(data, Genotype)
ID_color_mapping <- mutate(ID_color_mapping, Colors = generate_color_names(n()))
ID_color_mapping <- select(ID_color_mapping, Genotype, Colors)

# Add the color of each genotype to every row of the original table
data_with_colors <- left_join(data, ID_color_mapping, by = "Genotype")

# Write the colored table to a new CSV file without a header row.
# write.csv() always emits a header line (it ignores col.names), so
# write.table() with sep = "," and col.names = FALSE is used instead;
# qmethod = "double" keeps the CSV quoting convention of write.csv().
# NOTE(review): assumes the downstream col.group() call expects one element
# per line and no header - confirm against the bios2mds documentation.
Colored_genotypes <- "Colored_genotypes.csv"
write.table(data_with_colors, file = Colored_genotypes, sep = ",",
            row.names = FALSE, col.names = FALSE, qmethod = "double")

# Print a message indicating the completion
cat("Colors assigned and CSV file updated successfully. Check", Colored_genotypes, "for the modified data.\n")

#### 3.2) Plotting the PCA with colors assigned
# Load the group/color assignments from the CSV file into the ordination object
Colors <- col.group(Ordination, "Colored_genotypes.csv")

### 3.2.1) Plot eigen values
# Divide the plotting area into a grid of 2 rows by 3 columns
layout(matrix(seq_len(6), nrow = 2, ncol = 3))

# Scree plot of the eigenvalue percentages, to see which axis explains the
# most variation
scree.plot(Ordination$eigen.perc, lab = FALSE, title = "Eigenvalues")

### 3.2.2) Plot PCA as a 2D plot
# Draw axes 1 and 2 of the colored ordination
MDS_Plot <- mmds.2D.plot(
  Colors,
  title = "H5N1 genotypes",
  axis = c(1, 2),
  grid = FALSE,
  box.lwd = 0.1,
  active.pch = 20,
  active.cex = 2,
  active.legend.cex = 1.5,
  active.legend.text = 0.53
)

#### 4) K-means clustering
#### 4.1) Run a silhouette analysis to determine the best number of clusters in your ordination.
# Retrieve the ordination coordinates of interest: every axis except PC3.
# Base subsetting is used so this works whether the coordinates are stored
# as a matrix or as a data frame.
Coordinates <- Ordination$coord[, colnames(Ordination$coord) != "PC3", drop = FALSE]

# Perform silhouette analysis for 2 to 10 clusters
Sil <- sil.score(Coordinates, nb.clus = c(2:10), nb.run = 100, iter.max = 1000,
                 method = "euclidean")
Sil # Pick the number of clusters whose silhouette score is closest to 1.0.

#### 4.2) Run k-means clustering based on silhouette score.
# Number of clusters to use; set this to the value chosen from `Sil` above
# (7 is only an example).
Number_of_clusters <- 7
Kmeans <- kmeans(Coordinates, centers = Number_of_clusters, iter.max = 50, nstart = 10)

# Save the cluster assigned to each element
write.csv(Kmeans$cluster, "Clusters.csv")

# Report between_SS / total_SS for the fitted clustering. A score between
# 90 and 100% means the clustering performed well with these iterations.
cat("between_SS / total_SS =",
    round(100 * Kmeans$betweenss / Kmeans$totss, 1), "%\n")